#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-06-14 14:00:06
# Project: dxy_heart_3

from pyspider.libs.base_handler import *
import re

class Handler(BaseHandler):
    """Crawl articles reachable from the heart.dxy.cn news listing.

    Starts at the news tag listing, follows its pagination links, and for
    every article page found returns the page URL, title and body text.
    """

    # Default crawl settings for this project.
    # NOTE(review): the proxy address is hard-coded; confirm it is still
    # reachable or move it into deployment configuration.
    crawl_config = {
        "proxy": "111.62.251.72:80",
    }

    # Article detail URLs. A raw string keeps ``\w`` a regex escape instead of
    # an invalid string escape, and the dots are escaped so they match only a
    # literal "." in the host name. Compiled once rather than on every link.
    _ARTICLE_URL_RE = re.compile(r"http://heart\.dxy\.cn/article/\w+", re.U)

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the news listing page (scheduled every 24 h)."""
        self.crawl('http://heart.dxy.cn/tag/news/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue the article pages and further listing pages on a listing.

        Args:
            response: the fetched listing page; ``response.doc`` is queried
                with CSS selectors for absolute links and pagination anchors.
        """
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if self._ARTICLE_URL_RE.match(href):
                self.crawl(href, callback=self.detail_page)
        # Follow pagination links.
        for each in response.doc('.x_page1 > a').items():
            href = each.attr.href
            # Skip anchors without an href; there is nothing to follow.
            if href:
                self.crawl(href, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the fields of one article page.

        Args:
            response: the fetched article page.

        Returns:
            dict with the page ``url``, the text of its ``<title>`` element
            as ``title`` and the text of the ``.article__detail`` element as
            ``content``.
        """
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "content": response.doc('.article__detail').text(),
        }
